1. Import the data and the packages

1.1 Combine and clean the data

## Filter useful variables and remove duplicate events
athletes1 <- filter(athletes, Season != "Summer") %>% 
  select(NOC, Team, Sex, Year, Sport, Event, Medal) %>%
  distinct()

## Group by countries and calculate medals and games competing in
Events <- athletes1 %>%
  group_by(NOC, Medal) %>%
  summarise(total = n())
## `summarise()` has grouped output by 'NOC'. You can override using the `.groups` argument.
Medal <- spread(
  Events,
  key = "Medal",
  value = "total")
colnames(Medal)[5] <- "Participate"
Medal[is.na(Medal)] <- 0
Medal <- mutate(Medal, Total =  Gold + Silver + Bronze + Participate,
                TotalMedal = Gold + Silver + Bronze) %>%
  arrange(desc(Gold))
##  Every country with different code are calculated together in sports analysis
## Sum up the same countries in different code and append gdp in gdp analysis. Reserve only those who have names in gdppop data
Medal <- right_join(noc, Medal, by = "NOC")
country <- aggregate(cbind(Gold, Silver, Bronze, TotalMedal) ~ region, data = Medal, sum)  %>%
  arrange(desc(TotalMedal))

1.2 Over time comparation and Medals comparation

## Over Time table creation, the plot including 10 moat successful countries in Winter Olympics
Time <- athletes1 %>%
  group_by(NOC, Year, Medal) %>%
  summarise(total = n())
## `summarise()` has grouped output by 'NOC', 'Year'. You can override using the `.groups` argument.
Time <- right_join(noc, Time, by = "NOC")
Time <- spread(
  Time,
  key = "Medal",
  value = "total")
Time <- Time[, -3]
Time[is.na(Time)] <- 0
Time <- mutate(Time,
              Total = Gold + Silver + Bronze) %>%
  arrange(desc(Year),desc(Gold))
top10 <- country[1:10, ]
top10overtime <- Time[Time$region %in% top10$region, ]
## Picture here with top10 countries 
library(ggthemes)
top10hc <- hchart(top10overtime, "scatter", 
       hcaes(x = Year, 
             y = Total, 
             group = region))
top10hc
Sex <- athletes1 %>%
  filter(Medal != "Na") %>%
  group_by(NOC, Sex) %>%
  summarise(total = n())
## `summarise()` has grouped output by 'NOC'. You can override using the `.groups` argument.
Sex <- spread(
  Sex,
  key = "Sex",
  value = "total")

Medal <- left_join(Medal, Sex, by = "NOC")
Medal$F[is.na(Medal$F)] <- 0
Medal$M[is.na(Medal$M)] <- 0
Medal <- aggregate(cbind(Gold, Silver, Bronze, TotalMedal, Participate, Total, F, M) ~ region, data = Medal, sum)  %>%
  arrange(desc(TotalMedal))
Medalsex <- mutate(Medal, genderratio = F/M)
Medalsex <- Medalsex[1:10, ]

head(Medal, 10) %>% slice
##         region Gold Silver Bronze TotalMedal Participate Total   F   M
## 1      Germany  139    138    113        390         988  1378 193 197
## 2       Russia  156    117    108        381         752  1133 176 205
## 3       Norway  112    108    100        320         627   947  59 261
## 4          USA   97    104     89        290        1024  1314 125 165
## 5      Austria   61     80     82        223         649   872  75 148
## 6       Canada   65     60     56        181         864  1045  88  93
## 7      Finland   42     63     57        162         470   632  42 120
## 8       Sweden   50     40     54        144         587   731  38 106
## 9  Switzerland   50     40     47        137         583   720  40  97
## 10       Italy   37     34     45        116         788   904  41  75
topF <- Medal %>%
  select(region, F) %>%
  arrange(desc(F))
head(topF, 10) %>% slice
##     region   F
## 1  Germany 193
## 2   Russia 176
## 3      USA 125
## 4   Canada  88
## 5  Austria  75
## 6   Norway  59
## 7   France  44
## 8  Finland  42
## 9    Italy  41
## 10   China  41
topM <- Medal %>%
  select(region, M) %>%
  arrange(desc(M))
head(topM, 10) %>% slice
##         region   M
## 1       Norway 261
## 2       Russia 205
## 3      Germany 197
## 4          USA 165
## 5      Austria 148
## 6      Finland 120
## 7       Sweden 106
## 8  Switzerland  97
## 9       Canada  93
## 10       Italy  75
## The aggregated medal rank with a gender ratio perspective.
## The ratio closer to 1, the more equal for female and male to gain a medal in this country.
Medalcount <- ggplot(Medalsex, aes(x = TotalMedal, y = reorder(region, TotalMedal), size = Total)) + 
  geom_point(aes(color = genderratio)) + 
  labs(x="TotalMedal", y="region")
Medalcount

## Another pie chart of medals gained by top10 countries
Medalcount2 <- ggplot(Medalsex, 
                     aes(x = "", y = TotalMedal, fill = region)
             ) +
  geom_bar(stat="identity", width=1, color="white") +
  coord_polar("y", start=0) + 
  theme_void()
Medalcount2

  1. GDP
gdp <- right_join(noc, Medal, by = "region")
gdp <- gdp[-which(gdp$notes =="Hong Kong"), ]
gdp <- inner_join(gdp, gdppop, by = c("NOC" = "Code"))
gdp <- mutate(gdp, medal_gdp = 10^3 * (Gold * 3 + Silver *2 + Bronze) / gdp$'GDP per Capita', 
              medal_pop = 10^6 * (Gold * 3 + Silver *2 + Bronze) / Population)
medal_gdp <- select(gdp, region, TotalMedal, medal_gdp) %>%
  arrange(desc(medal_gdp))
slice(medal_gdp)
## # A tibble: 107 x 3
##    region  TotalMedal medal_gdp
##    <chr>        <dbl>     <dbl>
##  1 Russia         381     89.1 
##  2 Germany        390     19.5 
##  3 China           58     13.5 
##  4 USA            290     10.5 
##  5 Austria        223      9.71
##  6 Norway         320      8.76
##  7 Canada         181      8.58
##  8 Italy          116      7.48
##  9 Finland        162      7.30
## 10 Ukraine          8      6.15
## # ... with 97 more rows
medal_pop <- select(gdp, region, TotalMedal, medal_pop) %>%
  arrange(desc(medal_pop))
slice(medal_pop)
## # A tibble: 107 x 3
##    region        TotalMedal medal_pop
##    <chr>              <dbl>     <dbl>
##  1 Liechtenstein          9     400. 
##  2 Norway               320     125. 
##  3 Finland              162      56.4
##  4 Austria              223      49.4
##  5 Switzerland          137      33.4
##  6 Sweden               144      29.0
##  7 Netherlands          110      13.1
##  8 Estonia                7      13.0
##  9 Slovenia              15      11.1
## 10 Canada               181      10.3
## # ... with 97 more rows
## Remove some tiny countries with <= 20 medals
medal_gdp <- medal_gdp %>%
  filter(TotalMedal > 20)
medal_pop <- medal_pop %>%
  filter(TotalMedal > 20)
gdpplot <- ggplot(medal_gdp, aes(x=reorder(region, medal_gdp), y = medal_gdp, fill = region, text = paste("Total:", TotalMedal))) +
  geom_bar(stat="identity")  + 
  coord_flip() + 
  xlab("region") + 
  ggtitle("Great Power Competition",
  subtitle = "Calculate by GDP per capita * 10^3")
ggplotly(gdpplot)
popplot <- ggplot(medal_pop, aes(x=reorder(region, medal_pop), y = medal_pop, fill = region, text = paste("Total:", TotalMedal))) +
  geom_bar(stat="identity")  + 
  coord_flip() + 
  xlab("region") + 
  ggtitle("Advantage of King in the North", 
  subtitle = "Calculate by population * 10^6")
ggplotly(popplot)

3.Host Country Advantage

wiki_hosts <- read_html("https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[2]], fill=TRUE)[-1]
hosts <- hosts %>% filter(Winter != "") %>%
  select(City, Country, Year)

## Modify over time df to gain meanmedals of every country
Time <- Time[, -7]
general <- Time %>% 
  select(region, Gold, Total) %>%
  group_by(region) %>%
  summarize(meanmedal = mean(Total), meangold = mean(Gold)) %>%
  arrange(desc(meanmedal))
slice(general)
## # A tibble: 111 x 3
##    region      meanmedal meangold
##    <chr>           <dbl>    <dbl>
##  1 Russia          23.8      9.75
##  2 Germany         15        5.35
##  3 Norway          14.5      5.09
##  4 USA             13.2      4.41
##  5 Austria         10.1      2.77
##  6 Canada           8.23     2.95
##  7 Finland          7.36     1.91
##  8 Sweden           6.55     2.27
##  9 Switzerland      6.23     2.27
## 10 Netherlands      5.5      1.85
## # ... with 101 more rows
##Modify host countries official names and append meanmedals
hosts$Country[hosts$Country == "United States"] <- "USA"
hosts$Country[hosts$Country == "Russia[h]"] <- "Russia"
hosts$Country[hosts$Country == "Yugoslavia"] <- "Serbia"
hostmedal <- merge(hosts, Time, by.x = c("Country", "Year"), by.y = c("region", "Year"))
hostmedal <-  hostmedal %>% 
  select(Year, Country, Gold, Total) 
hostmedal <- merge(hostmedal, general, by.x = "Country", by.y = "region")
hostmedal <- mutate(hostmedal, hostadvantage = (Total - meanmedal) * 0.7 + (Gold - meangold) * 0.3) %>%
  arrange(desc(hostadvantage))
slice(hostmedal)
##        Country Year Gold Total  meanmedal meangold hostadvantage
## 1       Canada 2010   15    27  8.2272727 2.954545    16.7545455
## 2          USA 2002   10    34 13.1818182 4.409091    16.2500000
## 3       Russia 2014   15    38 23.8125000 9.750000    11.5062500
## 4       Norway 1994    8    23 14.5454545 5.090909     6.7909091
## 5        Japan 1998    5    10  2.2500000 0.500000     6.7750000
## 6        Italy 2006    5    11  5.2727273 1.681818     5.0045455
## 7       France 1992    3    10  5.2272727 1.545455     3.7772727
## 8       France 1968    4     9  5.2272727 1.545455     3.3772727
## 9  Switzerland 1948    3     9  6.2272727 2.272727     2.1590909
## 10     Austria 1964    4    12 10.1363636 2.772727     1.6727273
## 11      Norway 1952    7    16 14.5454545 5.090909     1.5909091
## 12       Japan 1972    1     3  2.2500000 0.500000     0.6750000
## 13      Serbia 1984    0     1  0.2105263 0.000000     0.5526316
## 14         USA 1932    6    13 13.1818182 4.409091     0.3500000
## 15         USA 1980    6    12 13.1818182 4.409091    -0.3500000
## 16      France 1924    0     4  5.2272727 1.545455    -1.3227273
## 17       Italy 1956    1     3  5.2727273 1.681818    -1.7954545
## 18         USA 1960    3    11 13.1818182 4.409091    -1.9500000
## 19      Canada 1988    0     6  8.2272727 2.954545    -2.4454545
## 20     Austria 1976    2     6 10.1363636 2.772727    -3.1272727
## 21 Switzerland 1928    0     1  6.2272727 2.272727    -4.3409091
## 22     Germany 1936    4     7 15.0000000 5.346154    -6.0038462
## Standard of hostadvantage = (Total - meanmedal) * 0.7 + (Gold - meangold) * 0.3
## Total = Medals gained during host Olympics this time
## Gold = Gold medals gained during host Olympics this time
## meanmedal =  Medals gained during host Olympics in average
## meangold = Gold medals gained during host Olympics in average
## Based on hostadvantage to create a plotly plot
hostplot <- hostmedal %>%
  plot_ly(x = ~Year, y = ~hostadvantage) %>%
  add_lines() %>%
  layout(title = "Hostadvantage Time Series")
hostplot <- add_markers(hostplot, size = ~Total, color = ~Country)

hostplot
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
  1. Most successful athletes
athletes2 <- filter(athletes, Season != "Summer") %>%
              group_by(Name, Medal) %>%
              summarise(total = n())
## `summarise()` has grouped output by 'Name'. You can override using the `.groups` argument.
success <- spread(athletes2, key = "Medal", value = "total") %>%
             arrange(desc(Gold))
success[is.na(success)] <- 0  
success <- mutate(success, total = Bronze + Silver + Gold, 
                   grade = Gold * 3 + Silver *2 + Bronze)%>%
  arrange(desc(grade))

Body <- filter(athletes, Season != "Summer") %>%
  select(Name, Sex, Height, Weight, Team) %>%
  group_by(Name, Sex, Team) %>%
  summarise(height = mean(Height), 
            weight = mean(Weight))
## `summarise()` has grouped output by 'Name', 'Sex'. You can override using the `.groups` argument.
Body <- distinct(Body, Name, .keep_all = TRUE)
success <- inner_join(success, Body, by = "Name")
slice(success)
## # A tibble: 18,925 x 11
## # Groups:   Name [18,923]
##    Name        Bronze  Gold Silver `<NA>` total grade Sex   Team   height weight
##    <chr>        <int> <int>  <int>  <int> <int> <dbl> <chr> <chr>   <dbl>  <dbl>
##  1 "Ole Einar~      1     8      4     14    13    33 M     Norway   178    65  
##  2 "Marit Bjr~      1     6      3      9    10    25 F     Norway   168    64  
##  3 "Lyubov Iv~      0     6      3      3     9    24 F     Russia   167    58  
##  4 "Raisa Pet~      1     4      5      5    10    23 F     Sovie~   162    53  
##  5 "Claudia P~      2     5      2      6     9    21 F     Germa~   166    61  
##  6 "Viktor An"      2     6      0      2     8    20 M     Russia   170    65  
##  7 "Edy Sixte~      2     4      3      3     9    20 M     Sweden   177    72  
##  8 "Yang Yang"      2     2      6      9    10    20 F     China    166.   58.9
##  9 "Irene Kar~      1     4      3      5     8    19 F     Nethe~   168    65  
## 10 "Ricco Gro"      1     4      3      7     8    19 M     Germa~   179    75  
## # ... with 18,915 more rows
## standard of success = Gold * 3 + Silver *2 + Bronze
bodyplot <- success %>%
  filter(grade >= 10) %>%
  plot_ly(x = ~weight, y = ~height, type = "scatter", mode = "markers", color = ~Sex, size = ~grade, text = ~paste('Name: ', Name))

bodyplot
## Warning: Ignoring 3 observations
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

6.datatable

library(DT)
## Medaldatatable has been ranged by Totalmedals while creating
## Countries who at least have  one particiption has been calculated
## Of course, sports is not just medals, the participation times and medals gained grouped by gender are listed in the datatable.
Medal[is.na(Medal)] <- 0
Medaldatatable <- aggregate(cbind(Gold, Silver, Bronze, TotalMedal, Participate, Total, F, M) ~ region, data = Medal, sum)  %>%
  arrange(desc(TotalMedal))
datatable(Medaldatatable,
          caption = "Table1: The aggregated medals every country has won over time",
          filter = "top") %>%
      formatStyle('region',  color = 'white', 
                backgroundColor = 'red', fontWeight = 'bold')